home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
3D GFX
/
3D GFX.iso
/
amiutils
/
e_h
/
flick
/
src
/
c2p_020.s
< prev
next >
Wrap
Text File
|
1995-12-30
|
7KB
|
309 lines
; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
; Cpu only solution
; Optimised for 020+fastram
; Aim for less than 90ms for 320x200x256 on 14MHz 020
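; The entry point is exported as _c2p_8_020, _c2p_6_020 or _c2p_4_020,
; depending on the assembly-time symbol "depth" (8, 6 or 4 bitplanes).
; All three take the same arguments:
;void __asm c2p_8_020 (register __a0 UBYTE *chunky_data,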
; register __a1 PLANEPTR raster,
; register __a2 UBYTE *dirty_list,
; register __d1 ULONG plsiz,
; register __a5 UBYTE *tmp_buffer);
; a0 -> width*height chunky pixels in fastmem
; a1 -> contiguous bitplanes in chipmem
; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
; d1 = width*height/8 (width*height must be a multiple of 32)
; a5 -> width*height tmp buffer in fastmem
ifeq depth-8
xdef _c2p_8_020
_c2p_8_020:
else
ifeq depth-6
xdef _c2p_6_020
_c2p_6_020:
else
ifeq depth-4
xdef _c2p_4_020
_c2p_4_020:
else
fail "unsupported depth!"
endc
endc
endc
wordmerge macro ; i1 i2 tmp
; \1 \2 \3
move.l \2,\3 ;\3 = CD
move.w \1,\2 ;\2 = CB
swap \2 ;\2 = BC
move.w \2,\1 ;\1 = AC
move.w \3,\2 ;\2 = BD
endm
merge macro ; io in out tmp msk sft
; \1 \2 \3 \4 \5 \6
; \1 = abqr
; \2 = ijyz
move.l \5,\3 ; \3 = 0x0x
move.l \3,\4 ; \4 = 0x0x
and.l \1,\3 ; \3 = 0b0r
and.l \2,\4 ; \4 = 0j0z
eor.l \3,\1 ; \1 = a0q0
eor.l \4,\2 ; \2 = i0y0
lsr.l #\6,\2 ; \2 = 0i0y
ifeq \6-1
add.l \3,\3
else
lsl.l #\6,\3 ; \3 = b0r0
endc
or.l \2,\1 ; \1 = aiqy
or.l \4,\3 ; \3 = bjrz
endm
merge4 macro ; io in out tmp msk
; \1 \2 \3 \4 \5
; \1 = abqr
; \2 = ijyz
ifgt depth-4
move.l \5,\3 ; \3 = 0x0x
move.l \3,\4 ; \4 = 0x0x
and.l \1,\3 ; \3 = 0b0r
and.l \2,\4 ; \4 = 0j0z
eor.l \3,\1 ; \1 = a0q0
eor.l \4,\2 ; \2 = i0y0
lsr.l #4,\2 ; \2 = 0i0y
or.l \2,\1 ; \1 = aiqy
move.l \1,(a5)+ ; write to tmp buffer
lsl.l #4,\3 ; \3 = b0r0
or.l \4,\3 ; \3 = bjrz
move.l \3,(a5)+ ; write to tmp buffer
else
move.l \5,\3 ; this version returns only 1 result
and.l \3,\2 ; \2 = 0j0z
and.l \1,\3 ; \3 = 0b0r
lsl.l #4,\3 ; \3 = b0r0
or.l \2,\3 ; \3 = bjrz
move.l \3,(a5)+ ; write to tmp buffer
endc
endm
merge1 macro ; io in out tmp msk flg
; \1 \2 \3 \4 \5 \6
; \1 = abqr
; \2 = ijyz
move.l \5,\3 ; \3 = 0x0x
move.l \3,\4 ; \4 = 0x0x
and.l \1,\3 ; \3 = 0b0r
and.l \2,\4 ; \4 = 0j0z
eor.l \3,\1 ; \1 = a0q0
eor.l \4,\2 ; \2 = i0y0
lsr.l #1,\2 ; \2 = 0i0y
or.l \2,\1 ; \1 = aiqy
move.l \1,(a2) ; write to output plane
suba.l a5,a2 ; -plsiz
add.l \3,\3 ; \3 = b0r0
or.l \4,\3 ; \3 = bjrz
ifne \6
move.l \3,(a2) ; write to output plane
suba.l a5,a2 ; -plsiz
endc
endm
start: jmp next ; self-modified code here
next: movem.l d1/a0-a1/a6,-(sp)
; Relocate c2p so that firstsweep2 is at a quad-longword-aligned address.
; Firstsweep2 loop doesn't fit in '020/'030 cache unless it is exactly aligned.
; Speed penalty of misalignment is about 30%.
lea (firstsweep2,pc),a0
move.l a0,d0
and.w #%00001111,d0 ; relocate by -d0.w bytes
lea (c2p,pc),a0 ; a0 = src
movea.l a0,a1
sub.w d0,a1 ; a1 = dst
move.l a1,start+2 ; patch jmp
move.w #(end-c2p)/2-1,d0
loop: move.w (a0)+,(a1)+ ; relocate code loop
dbra d0,loop
move.l (4).w,a6 ; flush cache
jsr (_LVOCacheClearU,a6)
movem.l (sp)+,d1/a0-a1/a6
bra.b start ; restart
ds.w 8 ; space for relocation of c2p routine
; the real c2p routine starts here
c2p:
movem.l d2-d7/a2-a6,-(sp)
sub.w #24,sp ; space for temporary variables
; a0 = chunky buffer
; a1 = output area
; a2 = dirty list
; d1 = plsiz
; a5 = tmp buffer
move.l a1,(4,sp) ; save output address
move.l a2,(8,sp) ; save dirty list ptr
move.l d1,(12,sp) ; save plsiz
lsl.l #3,d1
movea.l a0,a1
adda.l d1,a1 ; a1 -> end of chunky buffer
sub.l (12,sp),d1
ifle depth-6
sub.l (12,sp),d1
sub.l (12,sp),d1
endc
ifle depth-4
sub.l (12,sp),d1
sub.l (12,sp),d1
endc
move.l d1,(16,sp) ; save 7*plsiz (or 5*plsiz) (or 3*plsiz)
move.l a5,(20,sp) ; save tmp buffer address
;; Sweep thru the whole chunky data once,
;; Performing 3 merge operations on it.
move.l #$00ff00ff,a3 ; load byte merge mask
move.l #$0f0f0f0f,a4 ; load nibble merge mask
; pass 1
firstsweep: tst.b (a2)+ ; does next 32 pixel unit need updating?
bne.b firstsweep3
adda.w #32,a0 ; skip 32 pixels on input/output
cmpa.l a0,a1
bne.b firstsweep
bra.w exit ; exit if no changes
; this becomes the first sweep's main loop after the first change is found
firstsweep2: tst.b (a2)+ ; does next 32 pixel unit need updating?
bne.b firstsweep3
adda.w #32,a0 ; skip 32 pixels on input
cmpa.l a0,a1
bne.b firstsweep2
bra.w secondsweep ; on to second sweep if changes
firstsweep3:
movem.l (a0)+,d0-d7 ; get 32 pixels in registers
; d0-7 = abcd efgh ijkl mnop qrst uvwx yzAB CDEF
wordmerge d0,d4,a6 ;d0/4 = abqr cdst
wordmerge d1,d5,a6 ;d1/5 = efuv ghwx
wordmerge d2,d6,a6 ;d2/6 = ijyz klAB
wordmerge d3,d7,a6 ;d3/7 = mnCD opEF
; temporarily save off some registers
movea.l d7,a6
move.l d6,(sp)
; pass 2
merge d0,d2,d6,d7,a3,8 ;d0/d6 = aiqy bjrz
merge d1,d3,d7,d2,a3,8 ;d1/d7 = emuc fnvD
; pass 3
merge4 d0,d1,d2,d3,a4,4 ;d0/d2 = ae74... ae30...
merge4 d6,d7,d3,d1,a4,4 ;d6/d3 = bf74... bf30...
; bring them back
move.l a6,d7
move.l (sp),d6
; pass 2
merge d4,d6,d0,d1,a3,8 ;d4/d0 = cksA dltB
merge d5,d7,d1,d6,a3,8 ;d5/d1 = gowE hpxF
; pass 3
merge4 d4,d5,d6,d7,a4,4 ;d4/d6 = cg74.. cg30..
merge4 d0,d1,d7,d5,a4,4 ;d0/d7 = dh74.. dh30..
cmpa.l a0,a1
bne.w firstsweep2 ; end of firstsweep, 250 bytes
; only just fits in instr cache
; (a0) ae74.. ae30.. bf74.. bf30.. cg74.. cg30.. dh74.. dh30..
secondsweep:
movea.l a5,a1 ; a1 -> end of tmp buffer
movea.l (4,sp),a2 ; a2 -> plane0
movea.l (8,sp),a6 ; a6 -> dirty list
movea.l (12,sp),a5 ; a5 = plsiz
adda.l (16,sp),a2 ; a2 -> plane7
movea.l (20,sp),a0 ; a0 -> tmp buffer
movea.l #$33333333,a3
movea.l #$55555555,a4
bra.b secondsweep2
secondsweep3: addq.l #4,a2 ; skip 32 pixels on output
secondsweep2: tst.b (a6)+ ; does next 32 pixel unit need updating?
beq.b secondsweep3
ifgt depth-4
movem.l (a0)+,d0-d6 ; read tmp buffer, not d7 yet
; save d5 temporarily
move.l d5,(sp)
;; pass 4
merge d0,d4,d5,d7,a3,2 ; d0/d5 = aceg76.. aceg54..
merge d2,d6,d7,d4,a3,2 ; d2/d7 = bdhf76.. bdhf54..
;; pass 5
ifgt depth-6
merge1 d0,d2,d4,d6,a4,1 ; d0/d4 = abcd7... abcd6...
endc
merge1 d5,d7,d6,d2,a4,1 ; d5/d6 = abcd5... abcd4...
; restore d5 and finally get d7
move.l (sp),d5
move.l (a0)+,d7
else
movem.l (a0)+,d1/d3/d5/d7 ; read tmp buf, depth 4 version
endc
;; pass 4
merge d1,d5,d4,d6,a3,2 ; d1/d4 = aceg32.. aceg10..
merge d3,d7,d6,d5,a3,2 ; d3/d6 = bdhf32.. bdhf10..
;; pass 5
merge1 d1,d3,d5,d7,a4,1 ; d1/d5 = abcd3... abcd2...
merge1 d4,d6,d7,d3,a4,0 ; d4/d7 = abcd1... abcd0...
move.l d7,(a2)+ ; plane 0
adda.l (16,sp),a2 ; +7*plsiz (or 5*plsiz) (or 3*plsiz)
cmp.l a0,a1
bne.w secondsweep2 ; end of secondsweep, 216 bytes
exit:
add.w #24,sp
movem.l (sp)+,d2-d7/a2-a6
rts
end:
end